In [3]:
import pandas as pd
import plotly.express as px
# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where this exact file exists. Consider a configurable path.
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

# Categorical variables to summarise on the radar chart.
categorical_columns = ["workclass", "education", "marital-status", "occupation", "relationship"]

# Ensure the selected columns exist in the dataset. All missing names are
# reported at once so a renamed/misspelled column is easy to spot.
missing_columns = [col for col in categorical_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Categorical columns not found in dataset: {missing_columns}")
cat_vars = df[categorical_columns]

# For each variable, compute the share (in percent) taken by its single most
# frequent category. value_counts(normalize=True) gives per-category
# proportions (missing values are left out by its default), and .max() keeps
# the largest of them.
cat_summary = cat_vars.apply(lambda x: x.value_counts(normalize=True).max() * 100)

# Long-form frame for the radar chart: one row per variable.
df_radar = pd.DataFrame(dict(
    r=cat_summary.values,     # dominant-category share, in percent
    theta=cat_summary.index,  # variable name placed around the circle
))

# line_close=True joins the last point back to the first so the outline closes.
fig = px.line_polar(
    df_radar, r='r', theta='theta', line_close=True,
    title="Radar Chart of Categorical Variables in Adult Dataset",
)

# Show the plot
fig.show()
In [4]:
#Question 6. Construct a web graph (or spider graph) of the categorical variables. Fine-tune the graph so that interesting results emerge. Discuss your findings.
import pandas as pd
import plotly.express as px
# Read the Adult data from disk.
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

# Categorical variables to chart; each one gets its own radar chart.
categorical_columns = ["workclass", "education", "marital-status", "occupation", "relationship"]

for category in categorical_columns:
    # Percentage of rows falling into each level of this variable.
    cat_counts = df[category].value_counts(normalize=True).mul(100)

    # One row per level: r holds the percentage, theta the level name.
    df_radar = pd.DataFrame({
        "r": cat_counts.to_numpy(),
        "theta": cat_counts.index,
    })

    # Closed outline with a marker on every level.
    chart_title = f"Radar Chart for {category.capitalize()}"
    fig = px.line_polar(
        df_radar,
        r="r",
        theta="theta",
        line_close=True,
        title=chart_title,
        markers=True,
    )

    # Render this variable's chart before moving on to the next one.
    fig.show()
In [1]:
# Question 9. Construct a histogram of each numerical variable, with an overlay of the target variable income.
# Normalize if necessary.
import pandas as pd
import plotly.express as px
# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where this exact file exists. Consider a configurable path.
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

target_variable = "income"

# Check that the target variable exists before doing any other work, so the
# failure is reported up front rather than from inside the plotting loop.
if target_variable not in df.columns:
    raise ValueError(f"Target variable '{target_variable}' not found in dataset!")

# Select numerical variables, explicitly excluding the target variable so it is
# never plotted against itself if it happens to be stored as a number.
numerical_columns = [
    col
    for col in df.select_dtypes(include=['number']).columns
    if col != target_variable
]

# Optional histogram normalization, passed straight to px.histogram.
#   None      -> raw counts (default; figures are unchanged)
#   "percent" -> per plotly's documentation, each income group's bars are
#                scaled to sum to 100, which makes groups of very different
#                sizes comparable on one axis.
# Min-max rescaling of the columns is deliberately not performed here: it only
# relabels the x axis and does not make the income groups comparable.
HISTNORM = None

# Generate one histogram per numerical variable with the income overlay.
for col in numerical_columns:
    fig = px.histogram(
        df, x=col, color=target_variable,
        title=f"Histogram of {col} with Income Overlay",
        nbins=30, barmode="overlay", opacity=0.7,
        histnorm=HISTNORM,
    )
    fig.show()
In [2]:
# Question 10. For each pair of numerical variables, construct a scatter plot of the variables. Discuss your salient results.
import pandas as pd
import plotly.express as px
import itertools # To generate variable pairs
# Read the Adult data from disk.
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

# Names of every numeric column in the frame.
numerical_columns = list(df.select_dtypes(include=['number']).columns)

# combinations(..., 2) yields each unordered pair once, so a given pair of
# variables is plotted a single time rather than in both axis orders.
for var1, var2 in itertools.combinations(numerical_columns, 2):
    scatter_options = dict(
        x=var1,
        y=var2,
        title=f"Scatter Plot of {var1} vs {var2}",
        labels={var1: var1, var2: var2},
        color="income",  # colour points by income for comparison
        opacity=0.7,
    )
    fig = px.scatter(df, **scatter_options)
    fig.show()
In [ ]: